#####
# Group Identities and Parliamentary Debates: Replication Package
# Fiva, Nedregård and Øien
# Last updated: April 30, 2025
# Description: 
# - Cleans text data and generates lemmatized speeches to be used in the analysis. 
# - Produces speeches_session_lemma.csv main data used in the analysis.
#####


# Packages

library(data.table)
library(stringr)
library(dplyr)
library(lubridate)


## Directories (the working directory is set by master.R in ./script)

txt.dir            <- "../results/in_text"
raw.data.dir       <- "../data/1_raw_data"
processed.data.dir <- "../data/2_processed_data"


# - - - - -  - - - - - - - - - - - - -    - - - -
# 0 Read data                                ----
# - - - - -  - - - - - - - - - - - - -    - - - -

## Speech-level text data

DT <- fread(file.path(raw.data.dir, "NPD.csv"), encoding = "UTF-8")

# Keep only the variables needed downstream, renaming a few on the way.
# pid_session ("<pid_v28>-<session>") is the id of the unit of analysis.

DT <- DT[, .(pid_session      = paste(pid_v28, session, sep = "-"),
             candidatename_ed = name,
             pid              = pid_v28,
             id_speech, date, chamber, text, language, session,
             election_year    = election,
             party, elected, deputy, cabinet, time, status, committee)]

# List column holding each speech split on single spaces. It is dropped again
# (DT[, text.list := NULL]) before the speaker-session aggregation.

DT[, text.list := str_split(text, " ")]


## Meta data

meta <- fread(file.path(raw.data.dir, "meta_1945_2021.csv"), encoding = "UTF-8")

# In character columns, turn empty strings into NA; other columns are returned
# untouched.

blank_to_na <- function(x) {
  if (!is.character(x)) {
    return(x)
  }
  ifelse(x == "", NA, x)
}

meta[, (names(meta)) := lapply(.SD, blank_to_na)]


## Lemma dictionary

lemma.dt <- fread(file.path(raw.data.dir, "lemmas.csv"))


## Stop words and procedural words (both stored in a column called `lemmas`)

procedural <- fread(file.path(raw.data.dir, "procedural_lemma.csv"))$lemmas
stop_words <- fread(file.path(raw.data.dir, "stop_words_lemma.csv"))$lemmas

# From here on `stop_words` contains the procedural words followed by the
# stop words.
stop_words <- c(procedural, stop_words)


## MINUTES OF SPEECH


# For speeches with a time stamp, the length of a speech in minutes is taken as
# the gap between the start of the speech and the start of the next speech on
# the same date. This has to be computed on all speeches before any sample
# restriction is applied; otherwise the "next speech" (and hence the number of
# minutes) would be wrong.

# Lubridate does not work well with data.table, so dplyr is used here.
dt.min <- DT |>
  arrange(id_speech) |>
  mutate(time = case_when(time == "" ~ NA_character_, TRUE ~ time)) |>
  group_by(date) |>
  filter(!all(is.na(time))) |> # drop dates where no speech has a time stamp
  ungroup()

# The time stamps are parsed in a fixed time zone (UTC). Without `tz`,
# as.POSIXct() uses the time zone of the machine running the script, so clock
# times falling into a daylight-saving gap could turn into NA and gaps spanning
# a daylight-saving switch could be off by an hour. Only differences between
# time stamps on the same date are used, so the choice of zone itself does not
# matter as long as it has no clock changes.
dt.min <- dt.min |>
  mutate(datetime = as.POSIXct(paste(date, time),
                               format = "%Y-%m-%d %H:%M:%S",
                               tz = "UTC")) |>
  group_by(date) |>
  mutate(min = interval(datetime, lead(datetime)) / dminutes(1)) |>
  ungroup() # do not carry the grouping over into the data.table below

## Return to data.table

setDT(dt.min)

## Keep only speeches with a usable number of minutes:
##  - gaps of two hours or more are discarded, because two debates on the same
##    date cannot be told apart (original authors' note: this concerns only
##    (1 - ecdf(dt[!is.na(min)]$min)(120))*100 = .03 % of observations);
##  - nobody can talk for a non-positive number of minutes;
##  - missing gaps (no time stamp, or last speech of the date) drop out because
##    the condition evaluates to NA for them.
dt.min <- dt.min[min > 0 & min < 120, .(id_speech, min)]

# Attach the minutes to all speeches; speeches without usable minutes get NA.
DT <- merge(DT, dt.min, by = "id_speech", all.x = TRUE)




# - - - - -  - - - - - - - - - - - - ------- - - -
# 1 Sample restrictions --------------------------
# - - - - -  - - - - - - - - - - - - -    - - - -

# Each step below writes a speech count to a small text file in txt.dir
# (formatted with "," as thousands separator) and applies one restriction.

# Keep rows whose status is not "not matched" and whose election year is one
# of 1981, 1985, ..., 2017.
DT <- DT[status != "not matched" & election_year %in% seq(1981, 2017, 4)]

# Size of the full sample
n <- format(nrow(DT), big.mark = ",", scientific = FALSE)
writeLines(n, file.path(txt.dir, "n_full.txt"))

# Remove speeches from Lagtinget and Odelstinget
n <- format(DT[chamber %in% c("lagtinget", "odelstinget"), .N],
            big.mark = ",", scientific = FALSE)
writeLines(n, file.path(txt.dir, "n_lag_odel.txt"))
DT <- DT[!(chamber %in% c("lagtinget", "odelstinget"))]

# Remove deputies
n <- format(DT[status == "Deputy", .N], big.mark = ",", scientific = FALSE)
writeLines(n, file.path(txt.dir, "n_deputy.txt"))
DT <- DT[!(status == "Deputy")]

# Remove the president
n <- format(DT[status == "President", .N], big.mark = ",", scientific = FALSE)
writeLines(n, file.path(txt.dir, "n_president.txt"))
DT <- DT[!(status == "President")]

# Remove cabinet members and speakers who are not elected
n <- format(DT[elected == 0 | cabinet == 1, .N],
            big.mark = ",", scientific = FALSE)
writeLines(n, file.path(txt.dir, "n_cabinet_unelected.txt"))
DT <- DT[elected == 1][cabinet == 0]

# Nynorsk ----
# First remove every MP who uses Nynorsk in more than 50 percent of their
# speeches, then remove the remaining individual Nynorsk speeches.

DT[, nn.sh := mean(language == "Nynorsk"), by = "pid"] # Nynorsk share per MP
id_nn_speakers <- DT[nn.sh > 0.5, unique(pid)]

n <- format(DT[pid %in% id_nn_speakers, .N], big.mark = ",", scientific = FALSE)
writeLines(n, file.path(txt.dir, "n_nn_speakers.txt"))
DT <- DT[!(pid %in% id_nn_speakers)]

n <- format(DT[language %in% "Nynorsk", .N], big.mark = ",", scientific = FALSE)
writeLines(n, file.path(txt.dir, "n_nn_speeches.txt"))
DT <- DT[!(language %in% "Nynorsk")]

DT[, nn.sh := NULL]

# Keep only the main parties

core_parties <- c("FrP", "H", "KrF", "Sp", "V", "A", "SV")

n <- format(DT[!(party %in% core_parties), .N],
            big.mark = ",", scientific = FALSE)
writeLines(n, file.path(txt.dir, "n_non_core_parties.txt"))
DT <- DT[party %in% core_parties]

# Remove MPs who switch party (mid session)

## Dropped for session 2000-2001:
##   Dag Danielsen            (pid == 7375)   left FrP in April 2001
##   Fridtjof Frank Gundersen (pid == 10919)  left FrP in April 2001
##   Inger Stolt Nielsen      (pid == 16557)  left Høyre in 2001
##   Jørn L Stang             (pid == 20499)  left FrP in April 2001
##   Terje Knudsen            (pid == 38169)  left FrP in April 2001
##   Vidar Kleppe             (pid == 40814)  left FrP in April 2001
## Dropped for sessions 2019-2020 and 2020-2021:
##   Ulf Leirstein            (pid == 40431)  left FrP in April 2019,
##                                            party-independent in 2020-2021

for (switcher_pid in c(7375, 10919, 16557, 20499, 38169, 40814)) {
  DT <- DT[!(pid == switcher_pid & session == "2000-2001")]
}
DT <- DT[!(pid == 40431 & session %in% c("2019-2020", "2020-2021"))]

# Recode Jan Bøhler as "Sp" for 2020-2021 (switched from AP to SP in October 2020).
# NOTE(review): the speaker-session collapse later in this script does not carry
# DT's `party` column along, and `party` is re-attached from `meta` afterwards;
# confirm that this recode is reflected in the final data as intended.
DT[pid == 18302 & session == "2020-2021", party := "Sp"]



# - - - - -  - - - - - - - - - - - - -    - - - -
# 2 String cleaning and lemmatize----
# - - - - -  - - - - - - - - - - - - -    - - - -

DT[, text_orig := text] # keep the untouched speech text

## Clean the speech text:
##  1. remove parentheses together with everything inside them,
##  2. remove everything that is not a letter or whitespace
##     (punctuation, numbers, symbols),
##  3. remove leading, trailing and repeated whitespace.
DT[, text := text |>
     str_replace_all("\\([^()]*\\)", "") |>
     str_replace_all("[^\\p{L}\\s]", "") |>
     str_squish()]

# Drop speeches that are blank after the cleaning
DT <- DT[text != ""]

## One row per word and speech, to be lemmatized

DT_word <- DT[, .(text = unlist(str_split(text, " "))),
              by = c("id_speech", "session")]

## Remove MP names

# Strip non-letters from the names, split them into their parts and keep every
# distinct part that is longer than one character.
DT[, candidatename_ed := str_replace_all(candidatename_ed, "[^\\p{L}\\s]", "")]

names_list <- unique(unlist(str_split(DT$candidatename_ed, " ")))
names_list <- names_list[str_count(names_list) > 1]

remove_names <- c(names_list, paste0(names_list, "s")) # also remove genitive

# The comparison is made before lower-casing, i.e. it is case-sensitive.
DT_word <- DT_word[!(text %in% remove_names)]

DT_word[, text := tolower(text)] # make lower case

## Merge with the lemma dictionary (words without an entry are dropped)

DT_word <- merge(DT_word, lemma.dt, by = "text", sort = FALSE)

## Remove party acronyms and party names

party_acronym <- c("ap", "sp", "v", "h", "krf", "sv", "frp")

parties <- c("arbeiderpart", "senterpart",
             "venstre",  "høyre", "kristelig", "folkeparti",
             "sosialistisk venstreparti", "fremskrittsparti")

# Acronyms are matched as whole words; party names are matched as word
# beginnings, so that anything starting with a party name is blanked.
acronym_regex <- paste0("\\b", paste0(party_acronym, collapse = "\\b|\\b"), "\\b")
party_regex   <- paste0("(?<=^| )", parties, ".*?(?=$| )", collapse = "|")

DT_word[, lemmas := str_replace_all(lemmas, acronym_regex, "")]
DT_word[, lemmas := str_replace_all(lemmas, party_regex, "")]

# - - - - - - - - - - - - - -  - - - - - - - -  -
# Standardize all words starting in "fram" to "frem" (e.g., fremtid/framtid)

DT_word[, lemmas := gsub("\\bfram", "frem", lemmas)]

# Drop rows left empty, then stop words and procedural lemmas
DT_word <- DT_word[lemmas != ""]
DT_word <- DT_word[!(lemmas %in% stop_words)]


## Drop infrequent lemmas

## A lemma is retained only if it
##   - occurs at least 20 times in total,
##   - occurs at least 10 times in at least one parliamentary session, and
##   - occurs in at least 10 different sessions.
## NOTE(review): the original comment described these rules as "more than ten
## times" in a session and "at least ten unique speaker-sessions"; the code uses
## >= 10 and counts distinct sessions. Confirm which wording is intended.

per_session <- DT_word[, .(n_in_session = .N), by = c("lemmas", "session")]
per_lemma   <- per_session[, .(n_total        = sum(n_in_session),
                               max_in_session = max(n_in_session),
                               n_sessions     = .N),
                           by = "lemmas"]

lemmas_min_20             <- per_lemma[n_total < 20, lemmas]
lemmas_min_10_per_session <- per_lemma[max_in_session < 10, lemmas]
lemmas_min_10_sessions    <- per_lemma[n_sessions < 10, lemmas]

lemmas_to_remove <- unique(c(lemmas_min_10_per_session,
                             lemmas_min_20,
                             lemmas_min_10_sessions))

DT_word <- DT_word[!(lemmas %in% lemmas_to_remove)]
rm(per_session, per_lemma)


# Vocabulary used in the analysis, and its size
words_to_analyze <- unique(DT_word$lemmas)

n <- format(length(words_to_analyze), big.mark = ",", scientific = FALSE)
writeLines(n, file.path(txt.dir, "n_words.txt"))


## Replace the text variable with the lemmatized, pre-processed text

DT[, text := NULL]

# One row per speech: the remaining lemmas pasted together, and their number.
# Speeches with no remaining lemma are dropped by the merge.
speech_lemmas <- DT_word[, .(text         = paste(lemmas, collapse = " "),
                             n_words_only = .N),
                         by = "id_speech"]
DT <- merge(DT, speech_lemmas, by = "id_speech")
rm(speech_lemmas, DT_word)


# - - - - -  - - - - - - - - - - - - ------ - - -
# 3 Making and attaching meta data    ----
# - - - - -  - - - - - - - - - - - - -    - - - -

## COMMITTEE DUMMIES

# Lookup table of the policy committees used in the analysis, one committee per
# row: short dummy name, committee name as matched against the speech data, and
# the policy area the committee is grouped into.
# Non-policy committees are not listed.

committee_rows <- c(
  "arb_sos",                   "Arbeids- og sosialkomiteen",               "Labor, Health, and Social Affairs",
  "energi_miljo",              "Energi- og Miljøkomiteen",                 "Energy, Environment and Industry",
  "energi_industri",           "Energi- og industrikomiteen",              "Energy, Environment and Industry",
  "familie_kultur_admin",      "Fam.- Kultur- og adm.komiteen",            "Family and Cultural Affairs",
  "fam_kult",                  "Familie- og kulturkomiteen",               "Family and Cultural Affairs",
  "finans",                    "Finanskomiteen",                           "Finance",
  "forbruker_admin",           "Forbruker- og administrasjonsko",          "Labor, Health, and Social Affairs",
  "forsvar",                   "Forsvarskomiteen",                         "Foreign Affairs and Defence",
  "helse_omsorg",              "Helse- og omsorgskomiteen",                "Labor, Health, and Social Affairs",
  "justis",                    "Justiskomiteen",                           "Justice",
  "kirke_undervisning",        "Kirke- og undervisningskomiteen",          "Education and Research",
  "kirke_utdanning_forskning", "Kirke- utdannings- og forskningskomiteen", "Education and Research",
  "kom_forvalt",               "Kommunal- og forvaltningskomite",          "Local Government",
  "kom_miljo",                 "Kommunal- og miljøvernkomiteen",           "Local Government",
  "kommunal",                  "Kommunalkomiteen",                         "Local Government",
  "kontroll_konst",            "Kontroll- og konstitusjonskomit",          "Scrutiny and Const. Affairs",
  "landbruk",                  "Landbrukskomiteen",                        "Business and Industry",
  "naering",                   "Næringskomiteen",                          "Business and Industry",
  "samferdsel",                "Samferdselskomiteen",                      "Transport and Communications",
  "sjofart_fisk",              "Sjøfart og fiskerikomiteen",               "Business and Industry",
  "sosial",                    "Sosialkomiteen",                           "Labor, Health, and Social Affairs",
  "trans_kom",                 "Transport- og kommunikasjonskom",          "Transport and Communications",
  "utd_forsk",                 "Utdannings- og forskningskomite",          "Education and Research",
  "utenriks_konstitusjon",     "Utenriks og konstit.komiteen",             "Foreign Affairs and Defence",
  "uten_forsvar",              "Utenriks- og forsvarskomiteen",            "Foreign Affairs and Defence",
  "utenriks",                  "Utenrikskomiteen",                         "Foreign Affairs and Defence"
)

# Fold the flat vector into a three-column character data frame.
df <- as.data.frame(
  matrix(committee_rows, ncol = 3, byrow = TRUE,
         dimnames = list(NULL, c("committee_dummy_name",
                                 "committee_name",
                                 "policy_com"))),
  stringsAsFactors = FALSE
)
rm(committee_rows)

# Build, for every speaker-session (pid_session), indicator columns for the
# policy committee(s) in which the speaker made most of their speeches, plus
# two text columns (committee, policy_com) used in appendix figures.
# Only speeches with non-missing committee information are considered.

npd <- DT[!is.na(committee)] 
# Drop the comma that directly follows a hyphen (optionally with whitespace in
# between), so that the committee string can afterwards be split on commas
# without cutting a committee name in two.
npd[, committee := gsub("-(\\s*),", "-", committee)] # Fixing such that committee can be split by comma
npd[, c   := str_split(committee, ",")] # Making a new list variable with all committee names
npd[, c   := lapply(c, str_trim)] # trim whitespace around each committee name
unique_com_names_npd <- unique(unlist(npd$c)) # all committee names in the data
com.dt <- data.frame(committee = str_trim(unique_com_names_npd[!is.na(unique_com_names_npd)]))
setDT(com.dt)
# Attach the dummy name and policy area from the lookup table `df`; committee
# names without an entry in `df` get NA (all.x = T keeps them for now).
com.dt <- merge(com.dt, df, by.x = "committee", by.y = "committee_name", all.x = T)

# Those that have NA in committee_dummy_name are not in a policy committees and should
# be removed:

# com.dt[is.na(committee_dummy_name)]$committee
# [1] "Den forber. Fullmaktskomité"      "Den forberedende fullmaktskomité" "Den utvidede utenrikskomité"     
# [4] "Den utvidete utenriks- og forsv"  "Fullmaktskomiteen"                "Valgkomiteen"    

com.dt <- com.dt[!is.na(committee_dummy_name)]


## Making a list variable of committee assignments for each speech
## (dummy names of the policy committees listed for the speech; NA if none)
npd[, com := lapply(c, function(x) {com.dt$committee_dummy_name[com.dt$committee %in% x]})]
npd[, com := lapply(com, function(x) if (length(x) == 0) NA_character_ else x)]

## Collapse to one row per pid_session. `com` becomes a single vector with the
## committee assignments of all speeches in the pid_session (one entry per
## speech and committee), so that committees can be counted below.


npd.session <- npd[, .(name = unique(candidatename_ed),
                       session = unique(session),
                       com = list(c(unlist(com))) # This creates a vector for 
                       # committee assignments for all speeches in by pid_session
                       
), by = "pid_session"]

# Frequency table of the committee assignments per pid_session, sorted by count
# (table() ignores NA entries by default, so a pid_session with only NA
# assignments yields an empty table).
npd.session[, t.com := lapply(com, function(x) sort(table(x)))]
## We use the committees where majority of speeches are made
## (i.e. the committee(s) with the highest count; ties keep all of them,
## and an empty table gives NA)
npd.session[, t.com.max := lapply(t.com, function(x) {
  if (is.null(x) || length(x) == 0 || all(is.na(x))) return(NA_character_) 
  names(x[x == max(x, na.rm = TRUE)])
})] # t.com.max are the committee memberships where most speeches are made 

# Long table with one row per pid_session and top committee, reshaped to wide
# format: one column per committee dummy name, holding the number of rows for
# that pid_session/committee combination and 0 where there is none.
# pid_sessions without any policy committee are dropped here.
dummies <- npd.session[, .(c = unlist(t.com.max)), by = "pid_session"]
dummies <- dummies[!is.na(c)]
dummies <- dcast(dummies, pid_session ~ c, length, fill = 0)


# Merge on t.com.max (majority committee membership) to be used in appendix figures:

dummies <- merge(dummies, npd.session[, .(pid_session, committee = t.com.max)], by = "pid_session",
                 all.x = T)

## Add policy committee grouping to be used in appendix figures.
## (policy areas of the top committee(s), looked up in com.dt; NA if none)


dummies[, policy_com := lapply(committee, function(x) {com.dt$policy_com[com.dt$committee_dummy_name %in% x]})]
dummies[, policy_com := lapply(policy_com, function(x) if (length(x) == 0) NA_character_ else x)]

# Flatten both list columns to plain strings, separating multiple entries
# with "| ".
dummies[, committee := sapply(committee, function(x) paste(x, collapse = "| "))]
dummies[, policy_com := sapply(policy_com, function(x) paste(x, collapse = "| "))]


# The speech-level committee string and the split-text list column are not
# needed any more.
DT[, c("committee", "text.list") := NULL]


#fwrite(DT, file = paste(processed.data.dir, "speeches_pre_aggregation.csv", sep = "/"), 
#       encoding = "UTF-8")

## Collapse to the speaker x session level (one row per pid_session):
##  - text:          all lemmatized speeches pasted together
##  - n_words_only:  total number of retained lemmas
##  - n.speeches:    number of distinct speeches
##  - sum.min:       total minutes over the speeches that have minutes
##  - sum.min.words: number of retained lemmas in the speeches that have minutes

DT <- DT[, .(candidatename_ed = unique(candidatename_ed),
             pid              = unique(pid),
             election_year    = unique(election_year),
             session          = unique(session),
             text             = paste(text, collapse = " "),
             n_words_only     = sum(n_words_only),
             n.speeches       = uniqueN(id_speech),
             sum.min          = sum(min, na.rm = TRUE),
             sum.min.words    = as.double(sum(n_words_only[!is.na(min)],
                                              na.rm = TRUE))),
         by = "pid_session"]

## Non-positive totals (no speech with minutes) are set to missing

DT[sum.min <= 0, sum.min := NA_real_]
DT[sum.min.words <= 0, sum.min.words := NA_real_]


## Merge meta data

# Only MPs that are still in the sample; matched on MP and election year.
meta <- meta[pid %in% DT$pid]

meta_vars <- c("pid", "election_year", "female", "birthyear", "bloc",
               "town", "occupation", "age_cat", "party")

DT <- merge(DT, meta[, ..meta_vars], by = c("pid", "election_year"))
rm(meta_vars)

## Merge on committee dummies and committee variable to be used in appendix figures

## Number of speaker-sessions (pid_session) in DT that have no row in `dummies`;
## they are dropped by the merge below because they lack committee information.
##
## The previous expression indexed unique(DT$pid) with a logical vector built
## from unique(DT$pid_session). The two vectors differ in length, so R padded
## the result with NA and the value written was in fact this number of
## speaker-sessions. It is now computed directly; the number written to
## n_missing_committee.txt is unchanged.

n_missing <- length(setdiff(DT$pid_session, dummies$pid_session))
n <- format(n_missing, big.mark = ",", scientific = FALSE)
writeLines(n, file.path(txt.dir, "n_missing_committee.txt"))
rm(n_missing)

# Inner merge: keeps only speaker-sessions with committee information.
DT <- merge(DT, dummies, by = "pid_session")

# Number of observations and number of unique speakers in the final data

# Number of speaker-sessions
n <- format(uniqueN(DT$pid_session), big.mark = ",", scientific = FALSE)
writeLines(n, file.path(txt.dir, "tot_speakers.txt"))

# Number of MPs
n <- format(uniqueN(DT$pid), big.mark = ",", scientific = FALSE)
writeLines(n, file.path(txt.dir, "tot_mp.txt"))

# - - - - -  - - - - - - - - - - - - -    - - - -
# 4 Save analysis data -------------------- ----
# - - - - -  - - - - - - - - - - - - -    - - - -

# Main data set used in the analysis: one row per speaker-session.
fwrite(DT,
       file     = file.path(processed.data.dir, "speeches_session_lemma.csv"),
       encoding = "UTF-8")











